Jason M. Felice wrote:

This patch adds the --link-by-hash=DIR option, which hard links received
files in a link farm arranged by MD4 file hash.  The result is that the system
will only store one copy of the unique contents of each file, regardless of
the file's name.

To use this patch, run these commands for a successful build:

    patch -p1 <patches/link-by-hash.diff
    ./prepare-source
    ./configure
    make

based-on: 1ddcdaf3f6808ba53aef9e19f630a18808de22ac
diff --git a/Makefile.in b/Makefile.in
--- a/Makefile.in
+++ b/Makefile.in
@@ -36,7 +36,7 @@ OBJS1=flist.o rsync.o generator.o receiver.o cleanup.o sender.o exclude.o \
 	util.o main.o checksum.o match.o syscall.o log.o backup.o
 OBJS2=options.o io.o compat.o hlink.o token.o uidlist.o socket.o hashtable.o \
 	fileio.o batch.o clientname.o chmod.o acls.o xattrs.o
-OBJS3=progress.o pipe.o
+OBJS3=progress.o pipe.o hashlink.o
 DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
 popt_OBJS=popt/findme.o  popt/popt.o  popt/poptconfig.o \
 	popt/popthelp.o popt/poptparse.o
diff --git a/flist.c b/flist.c
--- a/flist.c
+++ b/flist.c
@@ -70,6 +70,7 @@ extern int unsort_ndx;
 extern uid_t our_uid;
 extern struct stats stats;
 extern char *filesfrom_host;
+extern char *link_by_hash_dir;
 
 extern char curr_dir[MAXPATHLEN];
 
@@ -854,7 +855,7 @@ static struct file_struct *recv_file_entry(int f, struct file_list *flist, int x
 		extra_len += EXTRA_LEN;
 #endif
 
-	if (always_checksum && S_ISREG(mode))
+	if ((always_checksum || link_by_hash_dir) && S_ISREG(mode))
 		extra_len += SUM_EXTRA_CNT * EXTRA_LEN;
 
 #if SIZEOF_INT64 >= 8
diff --git a/hashlink.c b/hashlink.c
new file mode 100644
--- /dev/null
+++ b/hashlink.c
@@ -0,0 +1,340 @@
+/*
+   Copyright (C) Cronosys, LLC 2004
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+/* This file contains code used by the --link-by-hash option. */
+
+#include "rsync.h"
+
+extern char *link_by_hash_dir;
+
+#ifdef HAVE_LINK
+
+char *make_hash_name(struct file_struct *file)
+{
+	char hash[4*2 + 1 + 12*2 + 1], *dst;
+	uchar c, *src = (uchar*)F_SUM(file);
+	int i;
+
+	for (dst = hash, i = 0; i < 4; i++, src++) {
+		c = *src >> 4;
+		*(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
+		c = *src & 0x0f;
+		*(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
+	}
+	*dst++ = '/';
+	for (i = 0; i < 12; i++, src++) {
+		c = *src >> 4;
+		*(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
+		c = *src & 0x0f;
+		*(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
+	}
+	*dst = '\0';
+
+	if (asprintf(&dst, "%s/%s", link_by_hash_dir, hash) < 0)
+		out_of_memory("make_hash_name");
+
+	return dst;
+}
+
+
+void kill_hashfile(struct hashfile_struct *hashfile)
+{
+	if (!hashfile)
+		return;
+	free(hashfile->name);
+	close(hashfile->fd);
+	free(hashfile);
+}
+
+
+void kill_hashfiles(struct hashfile_struct *hashfiles)
+{
+	struct hashfile_struct *iter, *next;
+	if ((iter = hashfiles) != NULL) {
+		do {
+			next = iter->next;
+			kill_hashfile(iter);
+			iter = next;
+		} while (iter != hashfiles);
+	}
+}
+
+
+struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
+{
+	DIR *d;
+	struct dirent *di;
+	struct hashfile_struct *hashfiles = NULL, *hashfile;
+	STRUCT_STAT st;
+	long this_fnbr;
+
+	*fnbr = 0;
+
+	/* Build a list of potential candidates and open
+	 * them. */
+	if ((d = opendir(hashname)) == NULL) {
+		rsyserr(FERROR, errno, "opendir failed: \"%s\"", hashname);
+		free(hashname);
+		return NULL;
+	}
+	while ((di = readdir(d)) != NULL) {
+		if (!strcmp(di->d_name,".") || !strcmp(di->d_name,"..")) {
+			continue;
+		}
+
+		/* We need to have the largest fnbr in case we need to store
+		 * a new file. */
+		this_fnbr = atol(di->d_name);
+		if (this_fnbr > *fnbr)
+			*fnbr = this_fnbr;
+
+		hashfile = new_array(struct hashfile_struct, 1);
+		if (asprintf(&hashfile->name,"%s/%s",hashname, di->d_name) < 0)
+			out_of_memory("find_hashfiles");
+		if (do_stat(hashfile->name,&st) == -1) {
+			rsyserr(FERROR, errno, "stat failed: %s", hashfile->name);
+			kill_hashfile(hashfile);
+			continue;
+		}
+		if (st.st_size != size) {
+			kill_hashfile(hashfile);
+			continue;
+		}
+		hashfile->nlink = st.st_nlink;
+		hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
+		if (hashfile->fd == -1) {
+			rsyserr(FERROR, errno, "open failed: %s", hashfile->name);
+			kill_hashfile(hashfile);
+			continue;
+		}
+		if (hashfiles == NULL)
+			hashfiles = hashfile->next = hashfile->prev = hashfile;
+		else {
+			hashfile->next = hashfiles;
+			hashfile->prev = hashfiles->prev;
+			hashfile->next->prev = hashfile;
+			hashfile->prev->next = hashfile;
+		}
+	}
+	closedir(d);
+
+	return hashfiles;
+}
+
+
+struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
+{
+	int amt, hamt;
+	char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
+	struct hashfile_struct *iter, *next, *best;
+	uint32 nlink;
+
+	if (!files)
+		return NULL;
+
+	iter = files; /* in case files are 0 bytes */
+	while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
+		iter = files;
+		do {
+			/* Icky bit to resync when we steal the first node. */
+			if (!files)
+				files = iter;
+
+			next = iter->next;
+
+			hamt = read(iter->fd, cmpbuffer, BUFSIZ);
+			if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
+				if (iter == files) {
+					files = files->prev;
+				}
+				if (iter->next == iter) {
+					files = next = NULL;
+				} else {
+					next = iter->next;
+					if (iter == files) {
+						/* So we know to resync */
+						files = NULL;
+					}
+				}
+				iter->next->prev = iter->prev;
+				iter->prev->next = iter->next;
+				kill_hashfile(iter);
+			}
+
+			iter = next;
+		} while (iter != files);
+
+		if (iter == NULL && files == NULL) {
+			/* There are no matches. */
+			return NULL;
+		}
+	}
+
+	if (amt == -1) {
+		rsyserr(FERROR, errno, "read failed in compare_hashfiles()");
+		kill_hashfiles(files);
+		return NULL;
+	}
+
+	/* If we only have one file left, use it. */
+	if (files == files->next) {
+		return files;
+	}
+
+	/* All files which remain in the list are identical and should have
+	 * the same size.  We pick the one with the lowest link count (we
+	 * may have rolled over because we hit the maximum link count for
+	 * the filesystem). */
+	best = iter = files;
+	nlink = iter->nlink;
+	do {
+		if (iter->nlink < nlink) {
+			nlink = iter->nlink;
+			best = iter;
+		}
+		iter = iter->next;
+	} while (iter != files);
+
+	best->next->prev = best->prev;
+	best->prev->next = best->next;
+	if (files == best)
+		files = files->next;
+	kill_hashfiles(files);
+	return best;
+}
+
+
+int link_by_hash(const char *fnametmp, const char *fname, struct file_struct *file)
+{
+	STRUCT_STAT st;
+	char *hashname = make_hash_name(file);
+	int first = 0, rc;
+	char *linkname;
+	long last_fnbr;
+
+	if (F_LENGTH(file) == 0)
+		return robust_rename(fnametmp, fname, NULL, 0644);
+
+	if (do_stat(hashname, &st) == -1) {
+		char *dirname;
+
+		/* Directory does not exist. */
+		dirname = strdup(hashname);
+		*strrchr(dirname,'/') = 0;
+		if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
+			rsyserr(FERROR, errno, "mkdir failed: %s", dirname);
+			free(hashname);
+			free(dirname);
+			return robust_rename(fnametmp, fname, NULL, 0644);
+		}
+		free(dirname);
+
+		if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
+			rsyserr(FERROR, errno, "mkdir failed: %s", hashname);
+			free(hashname);
+			return robust_rename(fnametmp, fname, NULL, 0644);
+		}
+
+		first = 1;
+		if (asprintf(&linkname,"%s/0",hashname) < 0)
+			out_of_memory("link_by_hash");
+		rprintf(FINFO, "(1) linkname = %s\n", linkname);
+	} else {
+		struct hashfile_struct *hashfiles, *hashfile;
+
+		if (do_stat(fnametmp,&st) == -1) {
+			rsyserr(FERROR, errno, "stat failed: %s", fname);
+			return -1;
+		}
+		hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
+
+		if (hashfiles == NULL) {
+			first = 1;
+			if (asprintf(&linkname,"%s/0",hashname) < 0)
+				out_of_memory("link_by_hash");
+			rprintf(FINFO, "(2) linkname = %s\n", linkname);
+		} else {
+			int fd;
+			/* Search for one identical to us. */
+			if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
+				rsyserr(FERROR, errno, "open failed: %s", fnametmp);
+				kill_hashfiles(hashfiles);
+				return -1;
+			}
+			hashfile = compare_hashfiles(fd, hashfiles);
+			hashfiles = NULL;
+			close(fd);
+
+			if (hashfile) {
+				first = 0;
+				linkname = strdup(hashfile->name);
+				rprintf(FINFO, "(3) linkname = %s\n", linkname);
+				kill_hashfile(hashfile);
+			} else {
+				first = 1;
+				if (asprintf(&linkname, "%s/%ld", hashname, last_fnbr + 1) < 0)
+					out_of_memory("link_by_hash");
+				rprintf(FINFO, "(4) linkname = %s\n", linkname);
+			}
+		}
+	}
+
+	if (!first) {
+		rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
+				linkname, full_fname(fname));
+		robust_unlink(fname);
+		rc = do_link(linkname, fname);
+		if (rc == -1) {
+			if (errno == EMLINK) {
+				first = 1;
+				free(linkname);
+				if (asprintf(&linkname,"%s/%ld",hashname, last_fnbr + 1) < 0)
+					out_of_memory("link_by_hash");
+				rprintf(FINFO, "(5) linkname = %s\n", linkname);
+				rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
+			} else {
+				rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
+					linkname, full_fname(fname));
+				rc = robust_rename(fnametmp, fname, NULL, 0644);
+			}
+		} else {
+			do_unlink(fnametmp);
+		}
+	}
+
+	if (first) {
+		rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
+				full_fname(fname),linkname);
+
+		rc = robust_rename(fnametmp, fname, NULL, 0644);
+		if (rc != 0) {
+			rsyserr(FERROR, errno, "rename \"%s\" -> \"%s\"",
+				full_fname(fnametmp), full_fname(fname));
+		}
+		rc = do_link(fname,linkname);
+		if (rc != 0) {
+			rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
+				full_fname(fname), linkname);
+		}
+	}
+
+	free(linkname);
+	free(hashname);
+	return rc;
+}
+#endif
diff --git a/options.c b/options.c
--- a/options.c
+++ b/options.c
@@ -155,6 +155,7 @@ char *backup_suffix = NULL;
 char *tmpdir = NULL;
 char *partial_dir = NULL;
 char *basis_dir[MAX_BASIS_DIRS+1];
+char *link_by_hash_dir = NULL;
 char *config_file = NULL;
 char *shell_cmd = NULL;
 char *logfile_name = NULL;
@@ -393,6 +394,7 @@ void usage(enum logcode F)
   rprintf(F,"     --compare-dest=DIR      also compare destination files relative to DIR\n");
   rprintf(F,"     --copy-dest=DIR         ... and include copies of unchanged files\n");
   rprintf(F,"     --link-dest=DIR         hardlink to files in DIR when unchanged\n");
+  rprintf(F,"     --link-by-hash=DIR      create hardlinks by hash into DIR\n");
   rprintf(F," -z, --compress              compress file data during the transfer\n");
   rprintf(F,"     --compress-level=NUM    explicitly set compression level\n");
   rprintf(F,"     --skip-compress=LIST    skip compressing files with a suffix in LIST\n");
@@ -445,7 +447,7 @@ enum {OPT_VERSION = 1000, OPT_DAEMON, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
       OPT_FILTER, OPT_COMPARE_DEST, OPT_COPY_DEST, OPT_LINK_DEST, OPT_HELP,
       OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW, OPT_MIN_SIZE, OPT_CHMOD,
       OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_ONLY_WRITE_BATCH, OPT_MAX_SIZE,
-      OPT_NO_D, OPT_APPEND, OPT_NO_ICONV,
+      OPT_NO_D, OPT_APPEND, OPT_NO_ICONV, OPT_LINK_BY_HASH,
       OPT_SERVER, OPT_REFUSED_BASE = 9000};
 
 static struct poptOption long_options[] = {
@@ -577,6 +579,7 @@ static struct poptOption long_options[] = {
   {"compare-dest",     0,  POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
   {"copy-dest",        0,  POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
   {"link-dest",        0,  POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
+  {"link-by-hash",     0,  POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
   {"fuzzy",           'y', POPT_ARG_VAL,    &fuzzy_basis, 1, 0, 0 },
   {"no-fuzzy",         0,  POPT_ARG_VAL,    &fuzzy_basis, 0, 0, 0 },
   {"no-y",             0,  POPT_ARG_VAL,    &fuzzy_basis, 0, 0, 0 },
@@ -1259,6 +1262,21 @@ int parse_arguments(int *argc_p, const char ***argv_p)
 			return 0;
 #endif
 
+                case OPT_LINK_BY_HASH:
+#ifdef HAVE_LINK
+			arg = poptGetOptArg(pc);
+			if (sanitize_paths)
+				arg = sanitize_path(NULL, arg, NULL, 0, SP_DEFAULT);
+			link_by_hash_dir = (char *)arg;
+			break;
+#else
+			snprintf(err_buf, sizeof err_buf,
+				 "hard links are not supported on this %s\n",
+				 am_server ? "server" : "client");
+			rprintf(FERROR, "ERROR: %s", err_buf);
+			return 0;
+#endif
+
 		default:
 			/* A large opt value means that set_refuse_options()
 			 * turned this option off. */
@@ -2049,6 +2067,11 @@ void server_options(char **args, int *argc_p)
 	} else if (inplace)
 		args[ac++] = "--inplace";
 
+	if (link_by_hash_dir && am_sender) {
+		args[ac++] = "--link-by-hash";
+		args[ac++] = link_by_hash_dir;
+	}
+
 	if (files_from && (!am_sender || filesfrom_host)) {
 		if (filesfrom_host) {
 			args[ac++] = "--files-from";
diff --git a/receiver.c b/receiver.c
--- a/receiver.c
+++ b/receiver.c
@@ -183,12 +183,14 @@ int open_tmpfile(char *fnametmp, const char *fname, struct file_struct *file)
 }
 
 static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r,
-			const char *fname, int fd, OFF_T total_size)
+			const char *fname, int fd, OFF_T total_size,
+			const char *md4)
 {
 	static char file_sum1[MAX_DIGEST_LEN];
 	static char file_sum2[MAX_DIGEST_LEN];
 	struct map_struct *mapbuf;
 	struct sum_struct sum;
+	md_context mdfour_data;
 	int32 len, sum_len;
 	OFF_T offset = 0;
 	OFF_T offset2;
@@ -208,6 +210,9 @@ static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r,
 	} else
 		mapbuf = NULL;
 
+	if (md4)
+		mdfour_begin(&mdfour_data);
+
 	sum_init(checksum_seed);
 
 	if (append_mode > 0) {
@@ -252,6 +257,8 @@ static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r,
 			cleanup_got_literal = 1;
 
 			sum_update(data, i);
+			if (md4)
+				mdfour_update(&mdfour_data, (uchar*)data, i);
 
 			if (fd != -1 && write_file(fd,data,i) != i)
 				goto report_write_error;
@@ -279,6 +286,8 @@ static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r,
 
 			see_token(map, len);
 			sum_update(map, len);
+			if (md4)
+				mdfour_update(&mdfour_data, (uchar*)map, len);
 		}
 
 		if (updating_basis_or_equiv) {
@@ -323,6 +332,8 @@ static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r,
 	}
 
 	sum_len = sum_end(file_sum1);
+	if (md4)
+		mdfour_result(&mdfour_data, (uchar*)md4);
 
 	if (mapbuf)
 		unmap_file(mapbuf);
@@ -338,7 +349,7 @@ static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r,
 
 static void discard_receive_data(int f_in, OFF_T length)
 {
-	receive_data(f_in, NULL, -1, 0, NULL, -1, length);
+	receive_data(f_in, NULL, -1, 0, NULL, -1, length, NULL);
 }
 
 static void handle_delayed_updates(char *local_name)
@@ -740,7 +751,7 @@ int recv_files(int f_in, char *local_name)
 
 		/* recv file data */
 		recv_ok = receive_data(f_in, fnamecmp, fd1, st.st_size,
-				       fname, fd2, F_LENGTH(file));
+				       fname, fd2, F_LENGTH(file), F_SUM(file));
 
 		log_item(log_code, file, &initial_stats, iflags, NULL);
 
diff --git a/rsync.c b/rsync.c
--- a/rsync.c
+++ b/rsync.c
@@ -47,6 +47,7 @@ extern int inplace;
 extern int flist_eof;
 extern int keep_dirlinks;
 extern int make_backups;
+extern char *link_by_hash_dir;
 extern struct file_list *cur_flist, *first_flist, *dir_flist;
 extern struct chmod_mode_struct *daemon_chmod_modes;
 #ifdef ICONV_OPTION
@@ -583,8 +584,15 @@ int finish_transfer(const char *fname, const char *fnametmp,
 	/* move tmp file over real file */
 	if (verbose > 2)
 		rprintf(FINFO, "renaming %s to %s\n", fnametmp, fname);
-	ret = robust_rename(fnametmp, fname, temp_copy_name,
-			    file->mode & INITACCESSPERMS);
+#ifdef HAVE_LINK
+	if (link_by_hash_dir)
+		ret = link_by_hash(fnametmp, fname, file);
+	else
+#endif
+	{
+		ret = robust_rename(fnametmp, fname, temp_copy_name,
+				    file->mode & INITACCESSPERMS);
+	}
 	if (ret < 0) {
 		rsyserr(FERROR_XFER, errno, "%s %s -> \"%s\"",
 			ret == -2 ? "copy" : "rename",
diff --git a/rsync.h b/rsync.h
--- a/rsync.h
+++ b/rsync.h
@@ -850,6 +850,14 @@ struct stats {
 	int num_transferred_files;
 };
 
+struct hashfile_struct {
+	struct hashfile_struct *next;
+	struct hashfile_struct *prev;
+	char *name;
+	int fd;
+	uint32 nlink;
+};
+
 struct chmod_mode_struct;
 
 struct flist_ndx_item {
diff --git a/rsync.yo b/rsync.yo
--- a/rsync.yo
+++ b/rsync.yo
@@ -392,6 +392,7 @@ to the detailed description below for a complete description.  verb(
      --compare-dest=DIR      also compare received files relative to DIR
      --copy-dest=DIR         ... and include copies of unchanged files
      --link-dest=DIR         hardlink to files in DIR when unchanged
+     --link-by-hash=DIR      create hardlinks by hash into DIR
  -z, --compress              compress file data during the transfer
      --compress-level=NUM    explicitly set compression level
      --skip-compress=LIST    skip compressing files with suffix in LIST
diff -up a/proto.h b/proto.h
--- a/proto.h
+++ b/proto.h
@@ -108,6 +108,12 @@ void itemize(const char *fnamecmp, struc
 int unchanged_file(char *fn, struct file_struct *file, STRUCT_STAT *st);
 void check_for_finished_files(int itemizing, enum logcode code, int check_redo);
 void generate_files(int f_out, const char *local_name);
+char *make_hash_name(struct file_struct *file);
+void kill_hashfile(struct hashfile_struct *hashfile);
+void kill_hashfiles(struct hashfile_struct *hashfiles);
+struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr);
+struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files);
+int link_by_hash(const char *fnametmp, const char *fname, struct file_struct *file);
 struct hashtable *hashtable_create(int size, int key64);
 void hashtable_destroy(struct hashtable *tbl);
 void *hashtable_find(struct hashtable *tbl, int64 key, int allocate_if_missing);
diff -up a/rsync.1 b/rsync.1
--- a/rsync.1
+++ b/rsync.1
@@ -467,6 +467,7 @@ to the detailed description below for a
      \-\-compare\-dest=DIR      also compare received files relative to DIR
      \-\-copy\-dest=DIR         ... and include copies of unchanged files
      \-\-link\-dest=DIR         hardlink to files in DIR when unchanged
+     \-\-link\-by\-hash=DIR      create hardlinks by hash into DIR
  \-z, \-\-compress              compress file data during the transfer
      \-\-compress\-level=NUM    explicitly set compression level
      \-\-skip\-compress=LIST    skip compressing files with suffix in LIST
